From 4e4051276be9219ee0801010fee53a35483801a1 Mon Sep 17 00:00:00 2001 From: "kaf24@firebug.cl.cam.ac.uk" Date: Wed, 23 Mar 2005 13:35:24 +0000 Subject: [PATCH] bitkeeper revision 1.1236.1.117 (4241709cmfgF-94U74o-PmIJUmoksQ) Fix iopl/iobitmap interfaces. Control tools can set per-domain access limits via DOM0_IOPL_PERMISSION/DOM0_IOPORT_PERMISSION. Guests can set current permissions per VCPU via physdev ops. These will always succeed -- checking against admin-set limits is done at access time. Signed-off-by: Keir Fraser --- .rootkeys | 1 + .../arch/xen/kernel/ioport.c | 43 ++++ .../arch/xen/kernel/process.c | 15 +- .../arch/xen/kernel/setup.c | 17 +- linux-2.4.29-xen-sparse/mkbuildtree | 1 - .../arch/xen/i386/kernel/ioport.c | 116 ++++++++-- .../arch/xen/i386/kernel/process.c | 75 ++----- .../arch/xen/i386/kernel/setup.c | 20 +- xen/arch/x86/dom0_ops.c | 55 ++++- xen/arch/x86/domain.c | 47 +--- xen/arch/x86/setup.c | 1 - xen/arch/x86/traps.c | 75 +++++-- xen/common/domain.c | 7 + xen/common/physdev.c | 202 +++++++----------- xen/include/asm-x86/domain.h | 11 +- xen/include/asm-x86/processor.h | 8 +- xen/include/public/dom0_ops.h | 26 ++- xen/include/public/physdev.h | 17 ++ 18 files changed, 431 insertions(+), 306 deletions(-) create mode 100644 linux-2.4.29-xen-sparse/arch/xen/kernel/ioport.c diff --git a/.rootkeys b/.rootkeys index c15234cd2d..64f3c714b0 100644 --- a/.rootkeys +++ b/.rootkeys @@ -145,6 +145,7 @@ 3e5a4e65_hqfuxtGG8IUy6wRM86Ecg linux-2.4.29-xen-sparse/arch/xen/kernel/entry.S 3e5a4e65Hy_1iUvMTPsNqGNXd9uFpg linux-2.4.29-xen-sparse/arch/xen/kernel/head.S 3e5a4e65RMGcuA-HCn3-wNx3fFQwdg linux-2.4.29-xen-sparse/arch/xen/kernel/i386_ksyms.c +4241709bNBs1q4Ss32YW0CyFVOGhEg linux-2.4.29-xen-sparse/arch/xen/kernel/ioport.c 3e5a4e653U6cELGv528IxOLHvCq8iA linux-2.4.29-xen-sparse/arch/xen/kernel/irq.c 3e5a4e65muT6SU3ck47IP87Q7Ti5hA linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c 4051db95N9N99FjsRwi49YKUNHWI8A linux-2.4.29-xen-sparse/arch/xen/kernel/pci-pc.c diff 
--git a/linux-2.4.29-xen-sparse/arch/xen/kernel/ioport.c b/linux-2.4.29-xen-sparse/arch/xen/kernel/ioport.c new file mode 100644 index 0000000000..4a716e0164 --- /dev/null +++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/ioport.c @@ -0,0 +1,43 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage long sys_iopl(unsigned int new_io_pl) +{ + unsigned int old_io_pl = current->thread.io_pl; + physdev_op_t op; + + if (new_io_pl > 3) + return -EINVAL; + + /* Need "raw I/O" privileges for direct port access. */ + if ((new_io_pl > old_io_pl) && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* Maintain OS privileges even if user attempts to relinquish them. */ + if (new_io_pl == 0) + new_io_pl = 1; + + /* Change our version of the privilege levels. */ + current->thread.io_pl = new_io_pl; + + /* Force the change at ring 0. */ + op.cmd = PHYSDEVOP_SET_IOPL; + op.u.set_iopl.iopl = new_io_pl; + HYPERVISOR_physdev_op(&op); + + return 0; +} + +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + return turn_on ? 
sys_iopl(3) : 0; +} diff --git a/linux-2.4.29-xen-sparse/arch/xen/kernel/process.c b/linux-2.4.29-xen-sparse/arch/xen/kernel/process.c index d8ef0b95d8..bc97ceeac7 100644 --- a/linux-2.4.29-xen-sparse/arch/xen/kernel/process.c +++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/process.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include @@ -304,6 +304,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump) void fastcall __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *next = &next_p->thread; + physdev_op_t op; __cli(); @@ -335,14 +336,12 @@ void fastcall __switch_to(struct task_struct *prev_p, struct task_struct *next_p } queue_multicall2(__HYPERVISOR_stack_switch, __KERNEL_DS, next->esp0); - if ( xen_start_info.flags & SIF_PRIVILEGED ) + + if ( prev_p->thread.io_pl != next->io_pl ) { - dom0_op_t op; - op.cmd = DOM0_IOPL; - op.u.iopl.domain = DOMID_SELF; - op.u.iopl.iopl = next->io_pl; - op.interface_version = DOM0_INTERFACE_VERSION; - queue_multicall1(__HYPERVISOR_dom0_op, (unsigned long)&op); + op.cmd = PHYSDEVOP_SET_IOPL; + op.u.set_iopl.iopl = next->io_pl; + queue_multicall1(__HYPERVISOR_physdev_op, (unsigned long)&op); } /* EXECUTE ALL TASK SWITCH XEN SYSCALLS AT THIS POINT. 
*/ diff --git a/linux-2.4.29-xen-sparse/arch/xen/kernel/setup.c b/linux-2.4.29-xen-sparse/arch/xen/kernel/setup.c index 742cff802d..63317263ee 100644 --- a/linux-2.4.29-xen-sparse/arch/xen/kernel/setup.c +++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/setup.c @@ -48,7 +48,7 @@ static int errno; #include #include #include -#include +#include #include #include #include @@ -206,6 +206,7 @@ void __init setup_arch(char **cmdline_p) unsigned long bootmap_size, start_pfn, lmax_low_pfn; int mem_param; /* user specified memory size in pages */ int boot_pfn; /* low pages available for bootmem */ + physdev_op_t op; extern void hypervisor_callback(void); extern void failsafe_callback(void); @@ -416,17 +417,9 @@ void __init setup_arch(char **cmdline_p) HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list = virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT; - /* If we are a privileged guest OS then we should request IO privileges. */ - if ( xen_start_info.flags & SIF_PRIVILEGED ) - { - dom0_op_t op; - op.cmd = DOM0_IOPL; - op.u.iopl.domain = DOMID_SELF; - op.u.iopl.iopl = 1; - if( HYPERVISOR_dom0_op(&op) != 0 ) - panic("Unable to obtain IOPL, despite being SIF_PRIVILEGED"); - current->thread.io_pl = 1; - } + op.cmd = PHYSDEVOP_SET_IOPL; + op.u.set_iopl.iopl = current->thread.io_pl = 1; + HYPERVISOR_physdev_op(&op); if (xen_start_info.flags & SIF_INITDOMAIN ) { diff --git a/linux-2.4.29-xen-sparse/mkbuildtree b/linux-2.4.29-xen-sparse/mkbuildtree index 4c3ebfe05e..15707041d0 100755 --- a/linux-2.4.29-xen-sparse/mkbuildtree +++ b/linux-2.4.29-xen-sparse/mkbuildtree @@ -232,7 +232,6 @@ ln -sf ../../../${LINUX_26}/arch/xen/kernel/fixup.c ln -sf ../../../${LINUX_26}/arch/xen/kernel/gnttab.c ln -sf ../../../${LINUX_26}/arch/xen/kernel/reboot.c ln -sf ../../../${LINUX_26}/arch/xen/kernel/skbuff.c -ln -sf ../../../${LINUX_26}/arch/xen/i386/kernel/ioport.c ln -sf ../../../${LINUX_26}/arch/xen/i386/kernel/pci-dma.c cd ${AD}/arch/xen/lib diff --git 
a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ioport.c b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ioport.c index 89c1c7e38f..3aa6c5a4cf 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ioport.c +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ioport.c @@ -1,19 +1,110 @@ +/* + * linux/arch/i386/kernel/ioport.c + * + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. + */ + #include #include #include #include #include -#include #include #include #include #include -#include +#include +#include + +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) +{ + unsigned long mask; + unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG); + unsigned int low_index = base & (BITS_PER_LONG-1); + int length = low_index + extent; + + if (low_index != 0) { + mask = (~0UL << low_index); + if (length < BITS_PER_LONG) + mask &= ~(~0UL << length); + if (new_value) + *bitmap_base++ |= mask; + else + *bitmap_base++ &= ~mask; + length -= BITS_PER_LONG; + } + + mask = (new_value ? ~0UL : 0UL); + while (length >= BITS_PER_LONG) { + *bitmap_base++ = mask; + length -= BITS_PER_LONG; + } + + if (length > 0) { + mask = ~(~0UL << length); + if (new_value) + *bitmap_base++ |= mask; + else + *bitmap_base++ &= ~mask; + } +} + + +/* + * this changes the io permissions bitmap in the current task. + */ +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + struct thread_struct * t = &current->thread; + unsigned long *bitmap; + physdev_op_t op; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. 
ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + if (!t->io_bitmap_ptr) { + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0xff, IO_BITMAP_BYTES); + t->io_bitmap_ptr = bitmap; + + op.cmd = PHYSDEVOP_SET_IOBITMAP; + op.u.set_iobitmap.bitmap = (unsigned long)bitmap; + op.u.set_iobitmap.nr_ports = IO_BITMAP_BITS; + HYPERVISOR_physdev_op(&op); + } + + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + + return 0; +} + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + * + * Here we just change the eflags value on the stack: we allow + * only the super-user to do it. This depends on the stack-layout + * on system-call entry - see also fork() and the signal handling + * code. + */ asmlinkage long sys_iopl(unsigned int new_io_pl) { unsigned int old_io_pl = current->thread.io_pl; - dom0_op_t op; + physdev_op_t op; if (new_io_pl > 3) return -EINVAL; @@ -22,9 +113,6 @@ asmlinkage long sys_iopl(unsigned int new_io_pl) if ((new_io_pl > old_io_pl) && !capable(CAP_SYS_RAWIO)) return -EPERM; - if (!(xen_start_info.flags & SIF_PRIVILEGED)) - return -EPERM; - /* Maintain OS privileges even if user attempts to relinquish them. */ if (new_io_pl == 0) new_io_pl = 1; @@ -33,19 +121,9 @@ asmlinkage long sys_iopl(unsigned int new_io_pl) current->thread.io_pl = new_io_pl; /* Force the change at ring 0. */ - op.cmd = DOM0_IOPL; - op.u.iopl.domain = DOMID_SELF; - op.u.iopl.iopl = new_io_pl; - HYPERVISOR_dom0_op(&op); + op.cmd = PHYSDEVOP_SET_IOPL; + op.u.set_iopl.iopl = new_io_pl; + HYPERVISOR_physdev_op(&op); return 0; } - -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) -{ -#if 0 - printk(KERN_INFO "ioperm not fully supported - %s\n", - turn_on ? 
"set iopl to 3" : "ignore resource release"); -#endif - return turn_on ? sys_iopl(3) : 0; -} diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/process.c b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/process.c index ce5ab25a4e..fdfc1f6207 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/process.c +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/process.c @@ -47,7 +47,7 @@ #include #include #include -#include +#include #ifdef CONFIG_MATH_EMULATION #include #endif @@ -228,20 +228,11 @@ void exit_thread(void) /* The process may have allocated an io port bitmap... nuke it. */ if (unlikely(NULL != t->io_bitmap_ptr)) { - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); - + physdev_op_t op = { 0 }; + op.cmd = PHYSDEVOP_SET_IOBITMAP; + HYPERVISOR_physdev_op(&op); kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; - /* - * Careful, clear this in the TSS too: - */ - memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); - t->io_bitmap_max = 0; - tss->io_bitmap_owner = NULL; - tss->io_bitmap_max = 0; - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; - put_cpu(); } } @@ -412,37 +403,6 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) return 1; } -static inline void -handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss) -{ - if (!next->io_bitmap_ptr) { - /* - * Disable the bitmap via an invalid offset. We still cache - * the previous bitmap owner and the IO bitmap contents: - */ - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; - return; - } - if (likely(next == tss->io_bitmap_owner)) { - /* - * Previous owner of the bitmap (hence the bitmap content) - * matches the next task, we dont have to do anything but - * to set a valid offset in the TSS: - */ - tss->io_bitmap_base = IO_BITMAP_OFFSET; - return; - } - /* - * Lazy TSS's I/O bitmap copy. We set an invalid offset here - * and we let the task to get a GPF in case an I/O instruction - * is performed. 
The handler of the GPF will verify that the - * faulting task has a valid I/O bitmap and, it true, does the - * real copy and restart the instruction. This will save us - * redundant copies when the currently switched task does not - * perform any I/O during its timeslice. - */ - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; -} /* * This special macro can be used to load a debugging register */ @@ -483,7 +443,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); - dom0_op_t op; + physdev_op_t iopl_op, iobmp_op; /* NB. No need to disable interrupts as already done in sched.c */ /* __cli(); */ @@ -540,12 +500,22 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas C(0); C(1); C(2); #undef C - if (xen_start_info.flags & SIF_PRIVILEGED) { - op.cmd = DOM0_IOPL; - op.u.iopl.domain = DOMID_SELF; - op.u.iopl.iopl = next->io_pl; - op.interface_version = DOM0_INTERFACE_VERSION; - queue_multicall1(__HYPERVISOR_dom0_op, (unsigned long)&op); + if (unlikely(prev->io_pl != next->io_pl)) { + iopl_op.cmd = PHYSDEVOP_SET_IOPL; + iopl_op.u.set_iopl.iopl = next->io_pl; + queue_multicall1(__HYPERVISOR_physdev_op, + (unsigned long)&iopl_op); + } + + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { + iobmp_op.cmd = + PHYSDEVOP_SET_IOBITMAP; + iobmp_op.u.set_iobitmap.bitmap = + (unsigned long)next->io_bitmap_ptr; + iobmp_op.u.set_iobitmap.nr_ports = + next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; + queue_multicall1(__HYPERVISOR_physdev_op, + (unsigned long)&iobmp_op); } /* EXECUTE ALL TASK SWITCH XEN SYSCALLS AT THIS POINT. 
*/ @@ -573,9 +543,6 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas loaddebug(next, 7); } - if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) - handle_io_bitmap(next, tss); - return prev_p; } diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/setup.c b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/setup.c index c64816b36b..7715a35b68 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/setup.c +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/setup.c @@ -52,6 +52,7 @@ #include #include #include +#include #include "setup_arch_pre.h" #include @@ -1399,9 +1400,9 @@ static void set_mca_bus(int x) { } */ void __init setup_arch(char **cmdline_p) { - int i,j; - - unsigned long max_low_pfn; + int i,j; + physdev_op_t op; + unsigned long max_low_pfn; /* Force a quick death if the kernel panics. */ extern int panic_timeout; @@ -1585,16 +1586,9 @@ void __init setup_arch(char **cmdline_p) register_memory(); - /* If we are a privileged guest OS then we should request IO privs. 
*/ - if (xen_start_info.flags & SIF_PRIVILEGED) { - dom0_op_t op; - op.cmd = DOM0_IOPL; - op.u.iopl.domain = DOMID_SELF; - op.u.iopl.iopl = 1; - if (HYPERVISOR_dom0_op(&op) != 0) - panic("Unable to obtain IOPL, despite SIF_PRIVILEGED"); - current->thread.io_pl = 1; - } + op.cmd = PHYSDEVOP_SET_IOPL; + op.u.set_iopl.iopl = current->thread.io_pl = 1; + HYPERVISOR_physdev_op(&op); if (xen_start_info.flags & SIF_INITDOMAIN) { if (!(xen_start_info.flags & SIF_PRIVILEGED)) diff --git a/xen/arch/x86/dom0_ops.c b/xen/arch/x86/dom0_ops.c index 6a09d2d75c..4ffecb3883 100644 --- a/xen/arch/x86/dom0_ops.c +++ b/xen/arch/x86/dom0_ops.c @@ -136,14 +136,61 @@ long arch_do_dom0_op(dom0_op_t *op, dom0_op_t *u_dom0_op) } break; - case DOM0_IOPL: + case DOM0_IOPL_PERMISSION: { + struct domain *d; + + ret = -EINVAL; + if ( op->u.iopl_permission.max_iopl > 3 ) + break; + + ret = -ESRCH; + if ( unlikely((d = find_domain_by_id( + op->u.iopl_permission.domain)) == NULL) ) + break; + + ret = 0; + d->arch.max_iopl = op->u.iopl_permission.max_iopl; + + put_domain(d); + } + break; + + case DOM0_IOPORT_PERMISSION: + { + struct domain *d; + unsigned int fp = op->u.ioport_permission.first_port; + unsigned int np = op->u.ioport_permission.nr_ports; + unsigned int p; + ret = -EINVAL; - if ( op->u.iopl.domain == DOMID_SELF ) + if ( (fp + np) > 65536 ) /* range may end at port 65535 */ + break; + + ret = -ESRCH; + if ( unlikely((d = find_domain_by_id( + op->u.ioport_permission.domain)) == NULL) ) + break; + + ret = -ENOMEM; + if ( d->arch.iobmp_mask == NULL ) /* first use: allocate mask */ { - current->arch.iopl = op->u.iopl.iopl & 3; - ret = 0; + if ( (d->arch.iobmp_mask = xmalloc_array( + u8, IOBMP_BYTES)) == NULL ) + { put_domain(d); break; } /* drop domain ref on failure */ + memset(d->arch.iobmp_mask, 0xFF, IOBMP_BYTES); /* set bit = denied */ } + + ret = 0; + for ( p = fp; p < (fp + np); p++ ) + { + if ( op->u.ioport_permission.allow_access ) + clear_bit(p, d->arch.iobmp_mask); + else + set_bit(p, d->arch.iobmp_mask); + } + + put_domain(d); } break; diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index 
287532ac66..db2efe8a86 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -735,7 +735,6 @@ void context_switch(struct exec_domain *prev_p, struct exec_domain *next_p) { struct tss_struct *tss = init_tss + smp_processor_id(); execution_context_t *stack_ec = get_execution_context(); - int i; __cli(); @@ -767,57 +766,33 @@ void context_switch(struct exec_domain *prev_p, struct exec_domain *next_p) loaddebug(&next_p->arch, 7); } - if ( VMX_DOMAIN(next_p) ) + if ( !VMX_DOMAIN(next_p) ) { - write_ptbase(next_p); - set_current(next_p); - __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->arch.gdt)); - __sti(); - goto done; - } - - SET_FAST_TRAP(&next_p->arch); + SET_FAST_TRAP(&next_p->arch); #ifdef __i386__ - /* Switch the kernel ring-1 stack. */ - tss->esp1 = next_p->arch.kernel_sp; - tss->ss1 = next_p->arch.kernel_ss; + /* Switch the kernel ring-1 stack. */ + tss->esp1 = next_p->arch.kernel_sp; + tss->ss1 = next_p->arch.kernel_ss; #endif + } /* Switch page tables. */ write_ptbase(next_p); } - if ( unlikely(prev_p->arch.io_bitmap != NULL) ) - { - for ( i = 0; i < sizeof(prev_p->arch.io_bitmap_sel) * 8; i++ ) - if ( !test_bit(i, &prev_p->arch.io_bitmap_sel) ) - memset(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT], - ~0U, IOBMP_BYTES_PER_SELBIT); - tss->bitmap = IOBMP_INVALID_OFFSET; - } - - if ( unlikely(next_p->arch.io_bitmap != NULL) ) - { - for ( i = 0; i < sizeof(next_p->arch.io_bitmap_sel) * 8; i++ ) - if ( !test_bit(i, &next_p->arch.io_bitmap_sel) ) - memcpy(&tss->io_bitmap[i * IOBMP_BYTES_PER_SELBIT], - &next_p->arch.io_bitmap[i * IOBMP_BYTES_PER_SELBIT], - IOBMP_BYTES_PER_SELBIT); - tss->bitmap = IOBMP_OFFSET; - } - set_current(next_p); - /* Switch GDT and LDT. 
*/ __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->arch.gdt)); - load_LDT(next_p); __sti(); - switch_segments(stack_ec, prev_p, next_p); + if ( !VMX_DOMAIN(next_p) ) + { + load_LDT(next_p); + switch_segments(stack_ec, prev_p, next_p); + } - done: /* * We do this late on because it doesn't need to be protected by the * schedule_lock, and because we want this to be the very last use of diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index b350229fd0..7ae035187d 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -312,7 +312,6 @@ void __init cpu_init(void) /* Set up and load the per-CPU TSS and LDT. */ t->bitmap = IOBMP_INVALID_OFFSET; - memset(t->io_bitmap, ~0, sizeof(t->io_bitmap)); #if defined(__i386__) t->ss0 = __HYPERVISOR_DS; t->esp0 = get_stack_bottom(); diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index 49d3a89086..d5ef2052f9 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -379,15 +379,50 @@ long do_fpu_taskswitch(int set) return 0; } -static inline int user_io_okay( +/* Has the guest requested sufficient permission for this I/O access? */ +static inline int guest_io_okay( unsigned int port, unsigned int bytes, struct exec_domain *ed, struct xen_regs *regs) { - if ( ed->arch.iopl < (KERNEL_MODE(ed, regs) ? 1 : 3) ) - return 0; - return 1; + u16 x; + if ( ed->arch.iopl >= (KERNEL_MODE(ed, regs) ? 1 : 3) ) + return 1; + if ( (ed->arch.iobmp_limit > (port + bytes)) && + (__get_user(x, (u16 *)(ed->arch.iobmp+(port>>3))) == 0) && + ((x & (((1<domain; + u16 x; + if ( IS_PRIV(d) || (d->arch.max_iopl >= (KERNEL_MODE(ed, regs) ? 
1 : 3)) ) + return 1; + if ( d->arch.iobmp_mask != NULL ) + { + x = *(u16 *)(d->arch.iobmp_mask + (port >> 3)); + if ( (x & (((1<edx, op_bytes, ed, regs) ) + if ( !guest_io_okay((u16)regs->edx, op_bytes, ed, regs) ) goto fail; switch ( op_bytes ) { case 1: - data = (u8)inb((u16)regs->edx); + data = (u8)inb_user((u16)regs->edx, ed, regs); if ( put_user((u8)data, (u8 *)regs->edi) ) goto write_fault; break; case 2: - data = (u16)inw((u16)regs->edx); + data = (u16)inw_user((u16)regs->edx, ed, regs); if ( put_user((u16)data, (u16 *)regs->edi) ) goto write_fault; break; case 4: - data = (u32)inl((u16)regs->edx); + data = (u32)inl_user((u16)regs->edx, ed, regs); if ( put_user((u32)data, (u32 *)regs->edi) ) goto write_fault; break; @@ -476,24 +511,24 @@ static int emulate_privileged_op(struct xen_regs *regs) case 0x6e: /* OUTSB */ op_bytes = 1; case 0x6f: /* OUTSW/OUTSL */ - if ( !user_io_okay((u16)regs->edx, op_bytes, ed, regs) ) + if ( !guest_io_okay((u16)regs->edx, op_bytes, ed, regs) ) goto fail; switch ( op_bytes ) { case 1: if ( get_user(data, (u8 *)regs->esi) ) goto read_fault; - outb((u8)data, (u16)regs->edx); + outb_user((u8)data, (u16)regs->edx, ed, regs); break; case 2: if ( get_user(data, (u16 *)regs->esi) ) goto read_fault; - outw((u16)data, (u16)regs->edx); + outw_user((u16)data, (u16)regs->edx, ed, regs); break; case 4: if ( get_user(data, (u32 *)regs->esi) ) goto read_fault; - outl((u32)data, (u16)regs->edx); + outl_user((u32)data, (u16)regs->edx, ed, regs); break; } regs->esi += (regs->eflags & EF_DF) ? 
-op_bytes : op_bytes; @@ -518,20 +553,20 @@ static int emulate_privileged_op(struct xen_regs *regs) case 0xe5: /* IN imm8,%eax */ port = insn_fetch(u8, 1, eip); exec_in: - if ( !user_io_okay(port, op_bytes, ed, regs) ) + if ( !guest_io_okay(port, op_bytes, ed, regs) ) goto fail; switch ( op_bytes ) { case 1: regs->eax &= ~0xffUL; - regs->eax |= (u8)inb(port); + regs->eax |= (u8)inb_user(port, ed, regs); break; case 2: regs->eax &= ~0xffffUL; - regs->eax |= (u16)inw(port); + regs->eax |= (u16)inw_user(port, ed, regs); break; case 4: - regs->eax = (u32)inl(port); + regs->eax = (u32)inl_user(port, ed, regs); break; } goto done; @@ -547,18 +582,18 @@ static int emulate_privileged_op(struct xen_regs *regs) case 0xe7: /* OUT %eax,imm8 */ port = insn_fetch(u8, 1, eip); exec_out: - if ( !user_io_okay(port, op_bytes, ed, regs) ) + if ( !guest_io_okay(port, op_bytes, ed, regs) ) goto fail; switch ( op_bytes ) { case 1: - outb((u8)regs->eax, port); + outb_user((u8)regs->eax, port, ed, regs); break; case 2: - outw((u16)regs->eax, port); + outw_user((u16)regs->eax, port, ed, regs); break; case 4: - outl((u32)regs->eax, port); + outl_user((u32)regs->eax, port, ed, regs); break; } goto done; diff --git a/xen/common/domain.c b/xen/common/domain.c index 1fb58789f3..523b65430b 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -129,6 +129,12 @@ struct domain *find_last_domain(void) } +#ifndef CONFIG_IA64 +extern void physdev_destroy_state(struct domain *d); +#else +#define physdev_destroy_state(_d) ((void)0) +#endif + void domain_kill(struct domain *d) { struct exec_domain *ed; @@ -139,6 +145,7 @@ void domain_kill(struct domain *d) for_each_exec_domain(d, ed) sched_rem_domain(ed); domain_relinquish_memory(d); + physdev_destroy_state(d); put_domain(d); } } diff --git a/xen/common/physdev.c b/xen/common/physdev.c index 56beb12bd7..64319b5aab 100644 --- a/xen/common/physdev.c +++ b/xen/common/physdev.c @@ -69,26 +69,47 @@ typedef struct _phys_dev_st { /* Find a device on a 
per-domain device list. */ -static phys_dev_t *find_pdev(struct domain *p, struct pci_dev *dev) +static phys_dev_t *find_pdev(struct domain *d, struct pci_dev *dev) { - phys_dev_t *t, *res = NULL; + phys_dev_t *t; + list_for_each_entry ( t, &d->pcidev_list, node ) + if ( dev == t->dev ) + return t; + return NULL; +} + +static int setup_ioport_memory_access(struct domain *d, struct pci_dev *pdev) +{ + struct resource *r; + int i, j; - list_for_each_entry ( t, &p->pcidev_list, node ) + if ( d->arch.iobmp_mask == NULL ) { - if ( dev == t->dev ) + if ( (d->arch.iobmp_mask = xmalloc_array(u8, IOBMP_BYTES)) == NULL ) + return -ENOMEM; + memset(d->arch.iobmp_mask, 0xFF, IOBMP_BYTES); + } + + for ( i = 0; i < DEVICE_COUNT_RESOURCE; i++ ) + { + r = &pdev->resource[i]; + if ( r->flags & IORESOURCE_IO ) { - res = t; - break; + INFO("Giving domain %u IO resources (%lx - %lx) " + "for device %s\n", d->id, r->start, r->end, pdev->slot_name); + for ( j = r->start; j < r->end + 1; j++ ) + clear_bit(j, d->arch.iobmp_mask); } } - return res; + + return 0; } /* Add a device to a per-domain device-access list. */ -static int add_dev_to_task(struct domain *p, struct pci_dev *dev, - int acc) +static int add_dev_to_task(struct domain *d, struct pci_dev *dev, int acc) { phys_dev_t *physdev; + int rc; if ( (physdev = xmalloc(phys_dev_t)) == NULL ) { @@ -96,104 +117,56 @@ static int add_dev_to_task(struct domain *p, struct pci_dev *dev, return -ENOMEM; } + if ( (rc = setup_ioport_memory_access(d, dev)) < 0 ) + { + xfree(physdev); + return rc; + } + physdev->dev = dev; physdev->flags = acc; physdev->state = 0; - list_add(&physdev->node, &p->pcidev_list); + list_add(&physdev->node, &d->pcidev_list); if ( acc == ACC_WRITE ) - physdev->owner = p; + physdev->owner = d; return 0; } -/* Remove a device from a per-domain device-access list. 
*/ -static void remove_dev_from_task(struct domain *p, struct pci_dev *dev) +void physdev_destroy_state(struct domain *d) { - phys_dev_t *physdev = find_pdev(p, dev); - - if ( physdev == NULL ) - BUG(); - - list_del(&physdev->node); - - xfree(physdev); -} + struct list_head *ent; -static int setup_ioport_memory_access(domid_t dom, struct domain* p, - struct exec_domain* ed, - struct pci_dev *pdev) -{ - struct exec_domain* edc; - int i, j; - - /* Now, setup access to the IO ports and memory regions for the device. */ - if ( ed->arch.io_bitmap == NULL ) + if ( d->arch.iobmp_mask != NULL ) { - if ( (ed->arch.io_bitmap = xmalloc_array(u8, IOBMP_BYTES)) == NULL ) - return -ENOMEM; - - memset(ed->arch.io_bitmap, 0xFF, IOBMP_BYTES); - - ed->arch.io_bitmap_sel = ~0ULL; - - for_each_exec_domain(p, edc) { - if (edc == ed) - continue; - edc->arch.io_bitmap = ed->arch.io_bitmap; - } + xfree(d->arch.iobmp_mask); + d->arch.iobmp_mask = NULL; } - for ( i = 0; i < DEVICE_COUNT_RESOURCE; i++ ) + while ( (ent = d->pcidev_list.next) != &d->pcidev_list ) { - struct resource *r = &pdev->resource[i]; - - if ( r->flags & IORESOURCE_IO ) - { - /* Give the domain access to the IO ports it needs. Currently, - * this will allow all processes in that domain access to those - * ports as well. This will do for now, since driver domains don't - * run untrusted processes! */ - INFO("Giving domain %u IO resources (%lx - %lx) " - "for device %s\n", dom, r->start, r->end, pdev->slot_name); - for ( j = r->start; j < r->end + 1; j++ ) - { - clear_bit(j, ed->arch.io_bitmap); - clear_bit(j / IOBMP_BITS_PER_SELBIT, &ed->arch.io_bitmap_sel); - } - } - /* rights to IO memory regions are checked when the domain maps them */ + list_del(ent); + xfree(list_entry(ent, phys_dev_t, node)); } - - for_each_exec_domain(p, edc) { - if (edc == ed) - continue; - edc->arch.io_bitmap_sel = ed->arch.io_bitmap_sel; - } - - return 0; } /* * physdev_pci_access_modify: * Allow/disallow access to a specific PCI device. 
Guests should not be * allowed to see bridge devices as it needlessly complicates things (one - * possible exception to this is the AGP bridge). If the given device is a - * bridge, then the domain should get access to all the leaf devices below - * that bridge (XXX this is unimplemented!). + * possible exception to this is the AGP bridge). */ int physdev_pci_access_modify(domid_t dom, int bus, int dev, int func, int enable) { struct domain *p; - struct exec_domain *ed; struct pci_dev *pdev; phys_dev_t *physdev; int rc = 0; - int oldacc = -1, allocated_physdev = 0; + int oldacc = -1; - if ( !IS_PRIV(current->domain) ) - BUG(); + BUG_ON(!IS_PRIV(current->domain)); if ( (bus > PCI_BUSMAX) || (dev > PCI_DEVMAX) || (func > PCI_FUNCMAX) ) return -EINVAL; @@ -209,8 +182,6 @@ int physdev_pci_access_modify(domid_t dom, int bus, int dev, int func, if ( (p = find_domain_by_id(dom)) == NULL ) return -ESRCH; - ed = p->exec_domain[0]; /* XXX */ - /* Make the domain privileged. */ set_bit(DF_PHYSDEV, &p->d_flags); /* FIXME: MAW for now make the domain REALLY privileged so that it @@ -222,47 +193,23 @@ int physdev_pci_access_modify(domid_t dom, int bus, int dev, int func, { INFO(" dev does not exist\n"); rc = -ENODEV; - goto clear_privilege; + goto out; } - if ( (physdev = find_pdev(p, pdev)) != NULL) { - /* Sevice already on list: update access permissions. */ - oldacc = physdev->flags; - physdev->flags = ACC_WRITE; - } else { - if ( (rc = add_dev_to_task(p, pdev, ACC_WRITE)) < 0) - goto clear_privilege; - allocated_physdev = 1; - } - INFO(" add RW %02x:%02x:%02x\n", pdev->bus->number, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); - /* Is the device a bridge or cardbus? 
*/ - if ( pdev->hdr_type != PCI_HEADER_TYPE_NORMAL ) { - INFO("XXX can't give access to bridge devices yet\n"); - rc = -EPERM; - goto remove_dev; + if ( (physdev = find_pdev(p, pdev)) != NULL ) + { + oldacc = physdev->flags; + physdev->flags = ACC_WRITE; } - - if ( (rc = setup_ioport_memory_access(dom, p, ed, pdev)) < 0 ) - goto remove_dev; - - put_domain(p); - return rc; - -remove_dev: - if (allocated_physdev) { - /* new device was added - remove it from the list */ - remove_dev_from_task(p, pdev); - } else { - /* device already existed - just undo the access changes */ - physdev->flags = oldacc; + else + { + rc = add_dev_to_task(p, pdev, ACC_WRITE); } - -clear_privilege: - clear_bit(DF_PHYSDEV, &p->d_flags); - clear_bit(DF_PRIVILEGED, &p->d_flags); + + out: put_domain(p); return rc; } @@ -308,9 +255,8 @@ int domain_iomem_in_pfn(struct domain *p, unsigned long pfn) } /* check if a domain has general access to a device */ -inline static int check_dev_acc (struct domain *p, - int bus, int dev, int func, - phys_dev_t **pdev) +static inline int check_dev_acc( + struct domain *d, int bus, int dev, int func, phys_dev_t **pdev) { struct pci_dev *target_dev; phys_dev_t *target_pdev; @@ -318,10 +264,10 @@ inline static int check_dev_acc (struct domain *p, *pdev = NULL; - if ( !IS_CAPABLE_PHYSDEV(p) ) - return -EPERM; /* no pci access permission */ + if ( !IS_CAPABLE_PHYSDEV(d) ) + return -EPERM; - if ( bus > PCI_BUSMAX || dev > PCI_DEVMAX || func > PCI_FUNCMAX ) + if ( (bus > PCI_BUSMAX) || (dev > PCI_DEVMAX) || (func > PCI_FUNCMAX) ) return -EINVAL; VERBOSE_INFO("b=%x d=%x f=%x ", bus, dev, func); @@ -336,7 +282,7 @@ inline static int check_dev_acc (struct domain *p, } /* check access */ - target_pdev = find_pdev(p, target_dev); + target_pdev = find_pdev(d, target_dev); if ( !target_pdev ) { VERBOSE_INFO("dom has no access to target\n"); @@ -748,6 +694,24 @@ long do_physdev_op(physdev_op_t *uop) ret = 0; break; + case PHYSDEVOP_SET_IOPL: + ret = -EINVAL; + if ( 
op.u.set_iopl.iopl > 3 ) + break; + ret = 0; + current->arch.iopl = op.u.set_iopl.iopl; + break; + + case PHYSDEVOP_SET_IOBITMAP: + ret = -EINVAL; + if ( !access_ok(VERIFY_READ, op.u.set_iobitmap.bitmap, IOBMP_BYTES) || + (op.u.set_iobitmap.nr_ports > 65536) ) + break; + ret = 0; + current->arch.iobmp = (u8 *)op.u.set_iobitmap.bitmap; + current->arch.iobmp_limit = op.u.set_iobitmap.nr_ports; + break; + default: ret = -EINVAL; break; diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h index ca4c192c13..b8838216cb 100644 --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -18,6 +18,10 @@ struct arch_domain l3_pgentry_t *mm_perdomain_l3; #endif + /* I/O-port access bitmap mask. */ + u8 *iobmp_mask; /* Address of IO bitmap mask, or NULL. */ + int max_iopl; /* Maximum achievable IOPL. */ + /* shadow mode status and controls */ unsigned int shadow_mode; /* flags to control shadow table operation */ spinlock_t shadow_lock; @@ -57,7 +61,6 @@ struct arch_exec_domain /* general user-visible register state */ execution_context_t user_ctxt; - unsigned int iopl; void (*schedule_tail) (struct exec_domain *); @@ -81,9 +84,9 @@ struct arch_exec_domain struct trap_bounce trap_bounce; /* I/O-port access bitmap. */ - u64 io_bitmap_sel; /* Selector to tell us which part of the IO bitmap are - * "interesting" (i.e. have clear bits) */ - u8 *io_bitmap; /* Pointer to task's IO bitmap or NULL */ + u8 *iobmp; /* Guest kernel virtual address of the bitmap. */ + int iobmp_limit; /* Number of ports represented in the bitmap. */ + int iopl; /* Current IOPL for this VCPU. */ /* Trap info. 
*/ #ifdef ARCH_HAS_FAST_TRAP diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h index 52e584d2e2..8eb856de04 100644 --- a/xen/include/asm-x86/processor.h +++ b/xen/include/asm-x86/processor.h @@ -328,9 +328,6 @@ static inline void clear_in_cr4 (unsigned long mask) } while (0) #define IOBMP_BYTES 8192 -#define IOBMP_BYTES_PER_SELBIT (IOBMP_BYTES / 64) -#define IOBMP_BITS_PER_SELBIT (IOBMP_BYTES_PER_SELBIT * 8) -#define IOBMP_OFFSET offsetof(struct tss_struct, io_bitmap) #define IOBMP_INVALID_OFFSET 0x8000 struct i387_state { @@ -372,9 +369,8 @@ struct tss_struct { u16 trace; #endif u16 bitmap; - u8 io_bitmap[IOBMP_BYTES+1]; - /* Pads the TSS to be cacheline-aligned (total size is 0x2080). */ - u8 __cacheline_filler[23]; + /* Pads the TSS to be cacheline-aligned (total size is 0x80). */ + u8 __cacheline_filler[24]; } __cacheline_aligned PACKED; #define IDT_ENTRIES 256 diff --git a/xen/include/public/dom0_ops.h b/xen/include/public/dom0_ops.h index b60b435941..ab1d6dc854 100644 --- a/xen/include/public/dom0_ops.h +++ b/xen/include/public/dom0_ops.h @@ -19,7 +19,7 @@ * This makes sure that old versions of dom0 tools will stop working in a * well-defined way (rather than crashing the machine, for instance). */ -#define DOM0_INTERFACE_VERSION 0xAAAA1001 +#define DOM0_INTERFACE_VERSION 0xAAAA1002 /************************************************************************/ @@ -120,13 +120,6 @@ typedef struct { MEMORY_PADDING; } PACKED dom0_setdomaininfo_t; /* 16 bytes */ -#define DOM0_IOPL 14 -typedef struct { - domid_t domain; /* 0 */ - u16 __pad; - u32 iopl; /* 4 */ -} PACKED dom0_iopl_t; /* 8 bytes */ - #define DOM0_MSR 15 typedef struct { /* IN variables. 
*/ @@ -414,6 +407,20 @@ typedef struct { u32 _pad0; } PACKED dom0_microcode_t; /* 16 bytes */ +#define DOM0_IOPL_PERMISSION 36 +typedef struct { + domid_t domain; /* 0: domain to be affected */ + u16 max_iopl; /* 2: new effective IOPL limit */ +} PACKED dom0_iopl_permission_t; /* 4 bytes */ + +#define DOM0_IOPORT_PERMISSION 37 +typedef struct { + domid_t domain; /* 0: domain to be affected */ + u16 first_port; /* 2: first port in range */ + u16 nr_ports; /* 4: size of port range */ + u16 allow_access; /* 6: allow or deny access to range? */ +} PACKED dom0_ioport_permission_t; /* 8 bytes */ + typedef struct { u32 cmd; /* 0 */ u32 interface_version; /* 4 */ /* DOM0_INTERFACE_VERSION */ @@ -429,7 +436,6 @@ typedef struct { dom0_setdomaininfo_t setdomaininfo; dom0_getdomaininfo_t getdomaininfo; dom0_getpageframeinfo_t getpageframeinfo; - dom0_iopl_t iopl; dom0_msr_t msr; dom0_debug_t debug; dom0_settime_t settime; @@ -449,6 +455,8 @@ typedef struct { dom0_read_memtype_t read_memtype; dom0_perfccontrol_t perfccontrol; dom0_microcode_t microcode; + dom0_iopl_permission_t iopl_permission; + dom0_ioport_permission_t ioport_permission; } PACKED u; } PACKED dom0_op_t; /* 80 bytes */ diff --git a/xen/include/public/physdev.h b/xen/include/public/physdev.h index 9ff3800afc..266dfc80c6 100644 --- a/xen/include/public/physdev.h +++ b/xen/include/public/physdev.h @@ -15,6 +15,8 @@ #define PHYSDEVOP_PCI_PROBE_ROOT_BUSES 3 #define PHYSDEVOP_IRQ_UNMASK_NOTIFY 4 #define PHYSDEVOP_IRQ_STATUS_QUERY 5 +#define PHYSDEVOP_SET_IOPL 6 +#define PHYSDEVOP_SET_IOBITMAP 7 /* Read from PCI configuration space. 
*/ typedef struct { @@ -62,6 +64,19 @@ typedef struct { u32 flags; /* 4 */ } PACKED physdevop_irq_status_query_t; /* 8 bytes */ +typedef struct { + /* IN */ + u32 iopl; /* 0: new IOPL for the calling VCPU; values above 3 are rejected */ +} PACKED physdevop_set_iopl_t; /* 4 bytes */ + +typedef struct { + /* IN */ + memory_t bitmap; /* 0: guest virtual address of the I/O-port bitmap */ + MEMORY_PADDING; + u32 nr_ports; /* 8: number of ports represented in the bitmap (max 65536) */ + u32 __pad0; /* 12 */ +} PACKED physdevop_set_iobitmap_t; /* 16 bytes */ + typedef struct _physdev_op_st { u32 cmd; /* 0 */ @@ -72,6 +87,8 @@ typedef struct _physdev_op_st physdevop_pci_initialise_device_t pci_initialise_device; physdevop_pci_probe_root_buses_t pci_probe_root_buses; physdevop_irq_status_query_t irq_status_query; + physdevop_set_iopl_t set_iopl; + physdevop_set_iobitmap_t set_iobitmap; u8 __dummy[32]; } PACKED u; } PACKED physdev_op_t; /* 40 bytes */ -- 2.30.2